# Calculate/load Differential Expression metrics for all genes, load SFARI dataset:
# Gene expression data
load('./../working_data/RNAseq_ASD_4region_normalized_vst.Rdata')
# SFARI genes
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
SFARI_genes = SFARI_genes %>% inner_join(datProbes, by=c('gene-symbol'='external_gene_id')) %>%
mutate('ID' = ensembl_gene_id) %>%
dplyr::select(ID, `gene-score`, syndromic)
# Balance Groups by covariates, remove singular batches (none)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
if(!file.exists('./../working_data/genes_ASD_DE_info_vst.csv')) {
# Calculate differential expression for ASD
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
genes_DE_info = top_genes[match(rownames(datExpr), rownames(top_genes)),] %>%
mutate('ID'=rownames(datExpr)) %>% left_join(SFARI_genes, by='ID')
write_csv(genes_DE_info, path='./../working_data/genes_ASD_DE_info_vst.csv')
rm(mod, corfit, lmfit, fit, top_genes)
} else {
genes_DE_info = read_csv('./../working_data/genes_ASD_DE_info_vst.csv')
}
genes_DE_info = genes_DE_info %>% dplyr::select(ID, logFC, AveExpr, t, P.Value, adj.P.Val,
B, `gene-score`, syndromic) %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`))
datExpr = datExpr %>% data.frame
datExpr_backup = datExpr
rm(to_keep, datProbes)
Raw data
ASD samples seem to have higher gene expression consistently for all genes (~2)
datExpr_raw = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datExpr_raw = datExpr_raw[substr(rownames(datExpr_raw),1,3)=='ENS',
substring(colnames(datExpr_raw),2) %in% datMeta$Dissected_Sample_ID]
datExpr_raw = datExpr_raw[rowSums(datExpr_raw)>5,]
asd_samples = as.numeric(datMeta$Diagnosis_=='ASD')
ctl_samples = as.numeric(!asd_samples)
asd_ctl_mean = data.frame('ID' = as.character(rownames(datExpr_raw)),
'ASD_mean' = apply(datExpr_raw, 1, function(x) mean(x*asd_samples)),
'CTL_mean' = apply(datExpr_raw, 1, function(x) mean(x*(ctl_samples)))) %>%
left_join(SFARI_genes, by='ID') %>% mutate(`gene-score`=as.factor(`gene-score`))
p1 = ggplotly(asd_ctl_mean %>% ggplot(aes(x=ASD_mean, y=CTL_mean, color=`gene-score`)) +
scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') + geom_point(alpha=0.4) +
geom_smooth(method=lm, se=FALSE, color='gray') +
ggtitle('Mean expression by gene ASD vs CTL') + theme_minimal())
p2 = ggplotly(asd_ctl_mean %>% dplyr::select(ID, ASD_mean, CTL_mean) %>% melt %>%
ggplot(aes(variable, value, fill=variable)) + geom_boxplot() +
coord_cartesian(ylim = c(0, 280)) + theme_minimal())
subplot(p1, p2, nrows=1)
remove(p1, p2, asd_ctl_mean)
VST normalised data
ASD samples seem to have higher gene expression consistently for all genes (~1.5)
asd_ctl_mean = data.frame('ID' = as.character(rownames(datExpr)),
'ASD_mean' = apply(datExpr, 1, function(x) mean(x*asd_samples)),
'CTL_mean' = apply(datExpr, 1, function(x) mean(x*(ctl_samples)))) %>%
left_join(SFARI_genes, by='ID') %>% mutate(`gene-score`=as.factor(`gene-score`))
p1 = ggplotly(asd_ctl_mean %>% ggplot(aes(x=ASD_mean, y=CTL_mean, color=`gene-score`)) +
geom_point(alpha=0.4) + geom_smooth(method=lm, se=FALSE, color='gray') +
ggtitle('Mean expression by gene ASD vs CTL') + theme_minimal())
p2 = ggplotly(asd_ctl_mean %>% dplyr::select(ID, ASD_mean, CTL_mean) %>% melt %>%
ggplot(aes(variable, value, fill=variable)) + geom_boxplot() + theme_minimal())
subplot(p1, p2, nrows=1)